29 November, 2020

Data manipulation

Manipulation functions (verbs)

Task Function Package
Sorting arrange() dplyr
Adding columns mutate() dplyr
Transformations mutate() dplyr
Re-ordering factor levels factor(,levels=) base
Re-labelling factor(,labels=) base
recode() dplyr
Re-naming columns rename(,new=old) dplyr
Filtering/Subsetting indexing base
~ columns select(,...) dplyr
pull(,...) dplyr
~ rows filter(,...) dplyr
Unique combinations distinct() dplyr
Reshaping data pivot_longer(), pivot_wider() tidyr
Split/combine columns separate(), unite() tidyr
Aggregating group_by() summarise() dplyr
group_by() count() dplyr
Merging/joining *_join() dplyr

Data manipulation grammar

Piping

  • %>%
data %>%
    select(...) %>%
        group_by(...) %>%
            summarise(...)

Data files

load(file = "../data/manipulationDatasets.RData")

data.1

Between Plot Cond Time Temp LAT LONG
A1 P1 H 1 25.78 14.95 144.7
A1 P1 M 2 24.33 16.17 142.1
A1 P1 L 3 24.96 15.53 144
A1 P2 H 4 25.73 14.96 145.8
A1 P2 M 1 25.05 15.61 147.7
A1 P2 L 2 24.88 15.74 144.8
A2 P3 H 3 20.98 19.71 145.8
A2 P3 M 4 21.39 19.27 144.9
A2 P3 L 1 20.34 20.17 142.2
A2 P4 H 2 20.49 19.61 144.2
A2 P4 M 3 21.52 19.11 144.2
A2 P4 L 4 22.18 18.31 144.9

Data manipulation packages

library(dplyr)
library(tidyr)
#OR better still
library(tidyverse)

Data files

head(data.1)
## # A tibble: 6 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
## 5 A1      P2    M         1  25.1  15.6  148.
## 6 A1      P2    L         2  24.9  15.7  145.
#OR
data.1 %>% head()
## # A tibble: 6 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
## 5 A1      P2    M         1  25.1  15.6  148.
## 6 A1      P2    L         2  24.9  15.7  145.
#OR
data.1 %>% head
## # A tibble: 6 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
## 5 A1      P2    M         1  25.1  15.6  148.
## 6 A1      P2    L         2  24.9  15.7  145.

Data files

summary(data.1)
##  Between Plot   Cond       Time           Temp            LAT             LONG      
##  A1:6    P1:3   H:4   Min.   :1.00   Min.   :20.34   Min.   :14.95   Min.   :142.1  
##  A2:6    P2:3   L:4   1st Qu.:1.75   1st Qu.:21.29   1st Qu.:15.59   1st Qu.:144.1  
##          P3:3   M:4   Median :2.50   Median :23.25   Median :17.24   Median :144.7  
##          P4:3         Mean   :2.50   Mean   :23.14   Mean   :17.43   Mean   :144.6  
##                       3rd Qu.:3.25   3rd Qu.:24.99   3rd Qu.:19.35   3rd Qu.:145.1  
##                       Max.   :4.00   Max.   :25.78   Max.   :20.17   Max.   :147.7

Data files

summary(data.1)
##  Between Plot   Cond       Time           Temp            LAT             LONG      
##  A1:6    P1:3   H:4   Min.   :1.00   Min.   :20.34   Min.   :14.95   Min.   :142.1  
##  A2:6    P2:3   L:4   1st Qu.:1.75   1st Qu.:21.29   1st Qu.:15.59   1st Qu.:144.1  
##          P3:3   M:4   Median :2.50   Median :23.25   Median :17.24   Median :144.7  
##          P4:3         Mean   :2.50   Mean   :23.14   Mean   :17.43   Mean   :144.6  
##                       3rd Qu.:3.25   3rd Qu.:24.99   3rd Qu.:19.35   3rd Qu.:145.1  
##                       Max.   :4.00   Max.   :25.78   Max.   :20.17   Max.   :147.7
data.1 %>% summary()
##  Between Plot   Cond       Time           Temp            LAT             LONG      
##  A1:6    P1:3   H:4   Min.   :1.00   Min.   :20.34   Min.   :14.95   Min.   :142.1  
##  A2:6    P2:3   L:4   1st Qu.:1.75   1st Qu.:21.29   1st Qu.:15.59   1st Qu.:144.1  
##          P3:3   M:4   Median :2.50   Median :23.25   Median :17.24   Median :144.7  
##          P4:3         Mean   :2.50   Mean   :23.14   Mean   :17.43   Mean   :144.6  
##                       3rd Qu.:3.25   3rd Qu.:24.99   3rd Qu.:19.35   3rd Qu.:145.1  
##                       Max.   :4.00   Max.   :25.78   Max.   :20.17   Max.   :147.7
data.1 %>% summary
##  Between Plot   Cond       Time           Temp            LAT             LONG      
##  A1:6    P1:3   H:4   Min.   :1.00   Min.   :20.34   Min.   :14.95   Min.   :142.1  
##  A2:6    P2:3   L:4   1st Qu.:1.75   1st Qu.:21.29   1st Qu.:15.59   1st Qu.:144.1  
##          P3:3   M:4   Median :2.50   Median :23.25   Median :17.24   Median :144.7  
##          P4:3         Mean   :2.50   Mean   :23.14   Mean   :17.43   Mean   :144.6  
##                       3rd Qu.:3.25   3rd Qu.:24.99   3rd Qu.:19.35   3rd Qu.:145.1  
##                       Max.   :4.00   Max.   :25.78   Max.   :20.17   Max.   :147.7

Data files

str(data.1)
## tibble [12 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Between: Factor w/ 2 levels "A1","A2": 1 1 1 1 1 1 2 2 2 2 ...
##  $ Plot   : Factor w/ 4 levels "P1","P2","P3",..: 1 1 1 2 2 2 3 3 3 4 ...
##  $ Cond   : Factor w/ 3 levels "H","L","M": 1 3 2 1 3 2 1 3 2 1 ...
##  $ Time   : int [1:12] 1 2 3 4 1 2 3 4 1 2 ...
##  $ Temp   : num [1:12] 25.8 24.3 25 25.7 25.1 ...
##  $ LAT    : num [1:12] 14.9 16.2 15.5 15 15.6 ...
##  $ LONG   : num [1:12] 145 142 144 146 148 ...

Dense summary

glimpse(data.1)
## Rows: 12
## Columns: 7
## $ Between <fct> A1, A1, A1, A1, A1, A1, A2, A2, A2, A2, A2, A2
## $ Plot    <fct> P1, P1, P1, P2, P2, P2, P3, P3, P3, P4, P4, P4
## $ Cond    <fct> H, M, L, H, M, L, H, M, L, H, M, L
## $ Time    <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
## $ Temp    <dbl> 25.77507, 24.32564, 24.96428, 25.73127, 25.05280, 24.88189, 20.97769, 2…
## $ LAT     <dbl> 14.94992, 16.16537, 15.53248, 14.95750, 15.61145, 15.73689, 19.70611, 1…
## $ LONG    <dbl> 144.6884, 142.0585, 144.0437, 145.8359, 147.7174, 144.7944, 145.7753, 1…

Dataframes and tibbles

Dataframes and tibbles

data.1 %>% as_tibble
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.
##  2 A1      P1    M         2  24.3  16.2  142.
##  3 A1      P1    L         3  25.0  15.5  144.
##  4 A1      P2    H         4  25.7  15.0  146.
##  5 A1      P2    M         1  25.1  15.6  148.
##  6 A1      P2    L         2  24.9  15.7  145.
##  7 A2      P3    H         3  21.0  19.7  146.
##  8 A2      P3    M         4  21.4  19.3  145.
##  9 A2      P3    L         1  20.3  20.2  142.
## 10 A2      P4    H         2  20.5  19.6  144.
## 11 A2      P4    M         3  21.5  19.1  144.
## 12 A2      P4    L         4  22.2  18.3  145.

Sorting data




Sorting data (arrange)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Sorting by LAT

data.1 %>% arrange(LAT)
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.
##  2 A1      P2    H         4  25.7  15.0  146.
##  3 A1      P1    L         3  25.0  15.5  144.
##  4 A1      P2    M         1  25.1  15.6  148.
##  5 A1      P2    L         2  24.9  15.7  145.
##  6 A1      P1    M         2  24.3  16.2  142.
##  7 A2      P4    L         4  22.2  18.3  145.
##  8 A2      P4    M         3  21.5  19.1  144.
##  9 A2      P3    M         4  21.4  19.3  145.
## 10 A2      P4    H         2  20.5  19.6  144.
## 11 A2      P3    H         3  21.0  19.7  146.
## 12 A2      P3    L         1  20.3  20.2  142.

Sorting data (arrange)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Sorting by LAT (descending order)

data.1 %>% arrange(-LAT)
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A2      P3    L         1  20.3  20.2  142.
##  2 A2      P3    H         3  21.0  19.7  146.
##  3 A2      P4    H         2  20.5  19.6  144.
##  4 A2      P3    M         4  21.4  19.3  145.
##  5 A2      P4    M         3  21.5  19.1  144.
##  6 A2      P4    L         4  22.2  18.3  145.
##  7 A1      P1    M         2  24.3  16.2  142.
##  8 A1      P2    L         2  24.9  15.7  145.
##  9 A1      P2    M         1  25.1  15.6  148.
## 10 A1      P1    L         3  25.0  15.5  144.
## 11 A1      P2    H         4  25.7  15.0  146.
## 12 A1      P1    H         1  25.8  14.9  145.

Sorting data (arrange)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Sorting by Cond and then Temp

data.1 %>% arrange(Cond,Temp)
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A2      P4    H         2  20.5  19.6  144.
##  2 A2      P3    H         3  21.0  19.7  146.
##  3 A1      P2    H         4  25.7  15.0  146.
##  4 A1      P1    H         1  25.8  14.9  145.
##  5 A2      P3    L         1  20.3  20.2  142.
##  6 A2      P4    L         4  22.2  18.3  145.
##  7 A1      P2    L         2  24.9  15.7  145.
##  8 A1      P1    L         3  25.0  15.5  144.
##  9 A2      P3    M         4  21.4  19.3  145.
## 10 A2      P4    M         3  21.5  19.1  144.
## 11 A1      P1    M         2  24.3  16.2  142.
## 12 A1      P2    M         1  25.1  15.6  148.

Sorting data (arrange)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Sort by the sum of Temp and LAT

data.1 %>% arrange(Temp+LAT)
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A2      P4    H         2  20.5  19.6  144.
##  2 A2      P4    L         4  22.2  18.3  145.
##  3 A1      P1    M         2  24.3  16.2  142.
##  4 A1      P1    L         3  25.0  15.5  144.
##  5 A2      P3    L         1  20.3  20.2  142.
##  6 A1      P2    L         2  24.9  15.7  145.
##  7 A2      P4    M         3  21.5  19.1  144.
##  8 A2      P3    M         4  21.4  19.3  145.
##  9 A1      P2    M         1  25.1  15.6  148.
## 10 A2      P3    H         3  21.0  19.7  146.
## 11 A1      P2    H         4  25.7  15.0  146.
## 12 A1      P1    H         1  25.8  14.9  145.

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
  • sort by Between and then Cond

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
  • sort by Between and then Cond
data.1 %>% arrange(Between,Cond)
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.
##  2 A1      P2    H         4  25.7  15.0  146.
##  3 A1      P1    L         3  25.0  15.5  144.
##  4 A1      P2    L         2  24.9  15.7  145.
##  5 A1      P1    M         2  24.3  16.2  142.
##  6 A1      P2    M         1  25.1  15.6  148.
##  7 A2      P3    H         3  21.0  19.7  146.
##  8 A2      P4    H         2  20.5  19.6  144.
##  9 A2      P3    L         1  20.3  20.2  142.
## 10 A2      P4    L         4  22.2  18.3  145.
## 11 A2      P3    M         4  21.4  19.3  145.
## 12 A2      P4    M         3  21.5  19.1  144.

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
  • sort by Cond and then the ratio of Temp to LAT

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
  • sort by Cond and then the ratio of Temp to LAT
data.1 %>% arrange(Cond,Temp/LAT)
## # A tibble: 12 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A2      P4    H         2  20.5  19.6  144.
##  2 A2      P3    H         3  21.0  19.7  146.
##  3 A1      P2    H         4  25.7  15.0  146.
##  4 A1      P1    H         1  25.8  14.9  145.
##  5 A2      P3    L         1  20.3  20.2  142.
##  6 A2      P4    L         4  22.2  18.3  145.
##  7 A1      P2    L         2  24.9  15.7  145.
##  8 A1      P1    L         3  25.0  15.5  144.
##  9 A2      P3    M         4  21.4  19.3  145.
## 10 A2      P4    M         3  21.5  19.1  144.
## 11 A1      P1    M         2  24.3  16.2  142.
## 12 A1      P2    M         1  25.1  15.6  148.

Adding columns




Mutate

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% mutate(LL=LAT+LONG)
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG    LL
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.  160.
##  2 A1      P1    M         2  24.3  16.2  142.  158.
##  3 A1      P1    L         3  25.0  15.5  144.  160.
##  4 A1      P2    H         4  25.7  15.0  146.  161.
##  5 A1      P2    M         1  25.1  15.6  148.  163.
##  6 A1      P2    L         2  24.9  15.7  145.  161.
##  7 A2      P3    H         3  21.0  19.7  146.  165.
##  8 A2      P3    M         4  21.4  19.3  145.  164.
##  9 A2      P3    L         1  20.3  20.2  142.  162.
## 10 A2      P4    H         2  20.5  19.6  144.  164.
## 11 A2      P4    M         3  21.5  19.1  144.  163.
## 12 A2      P4    L         4  22.2  18.3  145.  163.

Mutate

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Transformations

data.1 %>% mutate(logTemp=log(Temp))
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG logTemp
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>   <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.    3.25
##  2 A1      P1    M         2  24.3  16.2  142.    3.19
##  3 A1      P1    L         3  25.0  15.5  144.    3.22
##  4 A1      P2    H         4  25.7  15.0  146.    3.25
##  5 A1      P2    M         1  25.1  15.6  148.    3.22
##  6 A1      P2    L         2  24.9  15.7  145.    3.21
##  7 A2      P3    H         3  21.0  19.7  146.    3.04
##  8 A2      P3    M         4  21.4  19.3  145.    3.06
##  9 A2      P3    L         1  20.3  20.2  142.    3.01
## 10 A2      P4    H         2  20.5  19.6  144.    3.02
## 11 A2      P4    M         3  21.5  19.1  144.    3.07
## 12 A2      P4    L         4  22.2  18.3  145.    3.10

Mutate

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Centering

data.1 %>% mutate(MeanTemp=mean(Temp), cTemp=Temp-MeanTemp)
## OR if you just want the centered variable..
#data.1 %>% mutate(cTemp=Temp-mean(Temp))
## # A tibble: 12 x 9
##    Between Plot  Cond   Time  Temp   LAT  LONG MeanTemp  cTemp
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>    <dbl>  <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.     23.1  2.64 
##  2 A1      P1    M         2  24.3  16.2  142.     23.1  1.19 
##  3 A1      P1    L         3  25.0  15.5  144.     23.1  1.83 
##  4 A1      P2    H         4  25.7  15.0  146.     23.1  2.60 
##  5 A1      P2    M         1  25.1  15.6  148.     23.1  1.92 
##  6 A1      P2    L         2  24.9  15.7  145.     23.1  1.75 
##  7 A2      P3    H         3  21.0  19.7  146.     23.1 -2.16 
##  8 A2      P3    M         4  21.4  19.3  145.     23.1 -1.74 
##  9 A2      P3    L         1  20.3  20.2  142.     23.1 -2.79 
## 10 A2      P4    H         2  20.5  19.6  144.     23.1 -2.65 
## 11 A2      P4    M         3  21.5  19.1  144.     23.1 -1.62 
## 12 A2      P4    L         4  22.2  18.3  145.     23.1 -0.957

Mutate

data.1 %>% head(2) %>% as_tibble
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Changing vector types (classes)

data.1 %>% mutate(Time=factor(Time)) %>% as_tibble
## # A tibble: 12 x 7
##    Between Plot  Cond  Time   Temp   LAT  LONG
##    <fct>   <fct> <fct> <fct> <dbl> <dbl> <dbl>
##  1 A1      P1    H     1      25.8  14.9  145.
##  2 A1      P1    M     2      24.3  16.2  142.
##  3 A1      P1    L     3      25.0  15.5  144.
##  4 A1      P2    H     4      25.7  15.0  146.
##  5 A1      P2    M     1      25.1  15.6  148.
##  6 A1      P2    L     2      24.9  15.7  145.
##  7 A2      P3    H     3      21.0  19.7  146.
##  8 A2      P3    M     4      21.4  19.3  145.
##  9 A2      P3    L     1      20.3  20.2  142.
## 10 A2      P4    H     2      20.5  19.6  144.
## 11 A2      P4    M     3      21.5  19.1  144.
## 12 A2      P4    L     4      22.2  18.3  145.

Mutate

data.1 %>% head(2) %>% as_tibble
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Changing factor labels

data.1 %>% mutate(Cond=fct_recode(Cond, High='H',  Medium='M' )) %>%
  as_tibble
## # A tibble: 12 x 7
##    Between Plot  Cond    Time  Temp   LAT  LONG
##    <fct>   <fct> <fct>  <int> <dbl> <dbl> <dbl>
##  1 A1      P1    High       1  25.8  14.9  145.
##  2 A1      P1    Medium     2  24.3  16.2  142.
##  3 A1      P1    L          3  25.0  15.5  144.
##  4 A1      P2    High       4  25.7  15.0  146.
##  5 A1      P2    Medium     1  25.1  15.6  148.
##  6 A1      P2    L          2  24.9  15.7  145.
##  7 A2      P3    High       3  21.0  19.7  146.
##  8 A2      P3    Medium     4  21.4  19.3  145.
##  9 A2      P3    L          1  20.3  20.2  142.
## 10 A2      P4    High       2  20.5  19.6  144.
## 11 A2      P4    Medium     3  21.5  19.1  144.
## 12 A2      P4    L          4  22.2  18.3  145.

Mutate

data.1 %>% head(2) %>% as_tibble
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Changing factor levels

data.1 %>% pull(Cond)
##  [1] H M L H M L H M L H M L
## Levels: H L M
data.1 %>% mutate(Cond=fct_relevel(Cond, c('L', 'M','H'))) %>%
                    as_tibble() %>% pull(Cond)
##  [1] H M L H M L H M L H M L
## Levels: L M H

Mutate

data.1 %>% head(2) %>% as_tibble
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Changing factor labels and levels

data.1 %>% pull(Cond)
##  [1] H M L H M L H M L H M L
## Levels: H L M
data.1 %>% mutate(Cond=recode_factor(Cond, 'L'='Low', 'M'='Medium')) %>%
                    as_tibble() %>% pull(Cond)
##  [1] H      Medium Low    H      Medium Low    H      Medium Low    H      Medium Low   
## Levels: Low Medium H

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% mutate(leadTemp=lead(Temp), lagTemp=lag(Temp))
## # A tibble: 12 x 9
##    Between Plot  Cond   Time  Temp   LAT  LONG leadTemp lagTemp
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>    <dbl>   <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.     24.3    NA  
##  2 A1      P1    M         2  24.3  16.2  142.     25.0    25.8
##  3 A1      P1    L         3  25.0  15.5  144.     25.7    24.3
##  4 A1      P2    H         4  25.7  15.0  146.     25.1    25.0
##  5 A1      P2    M         1  25.1  15.6  148.     24.9    25.7
##  6 A1      P2    L         2  24.9  15.7  145.     21.0    25.1
##  7 A2      P3    H         3  21.0  19.7  146.     21.4    24.9
##  8 A2      P3    M         4  21.4  19.3  145.     20.3    21.0
##  9 A2      P3    L         1  20.3  20.2  142.     20.5    21.4
## 10 A2      P4    H         2  20.5  19.6  144.     21.5    20.3
## 11 A2      P4    M         3  21.5  19.1  144.     22.2    20.5
## 12 A2      P4    L         4  22.2  18.3  145.     NA      21.5

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Rank orders

data.1 %>% mutate(rankTime=min_rank(Time),
                  denseRankTime=dense_rank(Time))
## # A tibble: 12 x 9
##    Between Plot  Cond   Time  Temp   LAT  LONG rankTime denseRankTime
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>    <int>         <int>
##  1 A1      P1    H         1  25.8  14.9  145.        1             1
##  2 A1      P1    M         2  24.3  16.2  142.        4             2
##  3 A1      P1    L         3  25.0  15.5  144.        7             3
##  4 A1      P2    H         4  25.7  15.0  146.       10             4
##  5 A1      P2    M         1  25.1  15.6  148.        1             1
##  6 A1      P2    L         2  24.9  15.7  145.        4             2
##  7 A2      P3    H         3  21.0  19.7  146.        7             3
##  8 A2      P3    M         4  21.4  19.3  145.       10             4
##  9 A2      P3    L         1  20.3  20.2  142.        1             1
## 10 A2      P4    H         2  20.5  19.6  144.        4             2
## 11 A2      P4    M         3  21.5  19.1  144.        7             3
## 12 A2      P4    L         4  22.2  18.3  145.       10             4

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Rank orders

data.1 %>% mutate(rowTemp=row_number(Temp), rowTime=row_number(Time),
                  rankTime=min_rank(Time))
## # A tibble: 12 x 10
##    Between Plot  Cond   Time  Temp   LAT  LONG rowTemp rowTime rankTime
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>   <int>   <int>    <int>
##  1 A1      P1    H         1  25.8  14.9  145.      12       1        1
##  2 A1      P1    M         2  24.3  16.2  142.       7       4        4
##  3 A1      P1    L         3  25.0  15.5  144.       9       7        7
##  4 A1      P2    H         4  25.7  15.0  146.      11      10       10
##  5 A1      P2    M         1  25.1  15.6  148.      10       2        1
##  6 A1      P2    L         2  24.9  15.7  145.       8       5        4
##  7 A2      P3    H         3  21.0  19.7  146.       3       8        7
##  8 A2      P3    M         4  21.4  19.3  145.       4      11       10
##  9 A2      P3    L         1  20.3  20.2  142.       1       3        1
## 10 A2      P4    H         2  20.5  19.6  144.       2       6        4
## 11 A2      P4    M         3  21.5  19.1  144.       5       9        7
## 12 A2      P4    L         4  22.2  18.3  145.       6      12       10

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Rank of bins

data.1 %>% mutate(ntile(Temp,4))
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG `ntile(Temp, 4)`
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>            <int>
##  1 A1      P1    H         1  25.8  14.9  145.                4
##  2 A1      P1    M         2  24.3  16.2  142.                3
##  3 A1      P1    L         3  25.0  15.5  144.                3
##  4 A1      P2    H         4  25.7  15.0  146.                4
##  5 A1      P2    M         1  25.1  15.6  148.                4
##  6 A1      P2    L         2  24.9  15.7  145.                3
##  7 A2      P3    H         3  21.0  19.7  146.                1
##  8 A2      P3    M         4  21.4  19.3  145.                2
##  9 A2      P3    L         1  20.3  20.2  142.                1
## 10 A2      P4    H         2  20.5  19.6  144.                1
## 11 A2      P4    M         3  21.5  19.1  144.                2
## 12 A2      P4    L         4  22.2  18.3  145.                2

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Logical bins

data.1 %>% mutate(between(Temp,20,25))
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG `between(Temp, 20, 25)`
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <lgl>                  
##  1 A1      P1    H         1  25.8  14.9  145. FALSE                  
##  2 A1      P1    M         2  24.3  16.2  142. TRUE                   
##  3 A1      P1    L         3  25.0  15.5  144. TRUE                   
##  4 A1      P2    H         4  25.7  15.0  146. FALSE                  
##  5 A1      P2    M         1  25.1  15.6  148. FALSE                  
##  6 A1      P2    L         2  24.9  15.7  145. TRUE                   
##  7 A2      P3    H         3  21.0  19.7  146. TRUE                   
##  8 A2      P3    M         4  21.4  19.3  145. TRUE                   
##  9 A2      P3    L         1  20.3  20.2  142. TRUE                   
## 10 A2      P4    H         2  20.5  19.6  144. TRUE                   
## 11 A2      P4    M         3  21.5  19.1  144. TRUE                   
## 12 A2      P4    L         4  22.2  18.3  145. TRUE

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Categorical bins

data.1 %>% mutate(fTemp=ifelse(Temp<21, 'Low',
                     ifelse(between(Temp,21,25), 'Medium', 'High')))
## OR
data.1 %>% mutate(fTemp=case_when(Temp<21 ~ 'Low',
                               between(Temp, 21, 25) ~ 'Medium',
                               Temp>25 ~ 'High'))
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG fTemp 
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <chr> 
##  1 A1      P1    H         1  25.8  14.9  145. High  
##  2 A1      P1    M         2  24.3  16.2  142. Medium
##  3 A1      P1    L         3  25.0  15.5  144. Medium
##  4 A1      P2    H         4  25.7  15.0  146. High  
##  5 A1      P2    M         1  25.1  15.6  148. High  
##  6 A1      P2    L         2  24.9  15.7  145. Medium
##  7 A2      P3    H         3  21.0  19.7  146. Low   
##  8 A2      P3    M         4  21.4  19.3  145. Medium
##  9 A2      P3    L         1  20.3  20.2  142. Low   
## 10 A2      P4    H         2  20.5  19.6  144. Low   
## 11 A2      P4    M         3  21.5  19.1  144. Medium
## 12 A2      P4    L         4  22.2  18.3  145. Medium
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG fTemp 
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <chr> 
##  1 A1      P1    H         1  25.8  14.9  145. High  
##  2 A1      P1    M         2  24.3  16.2  142. Medium
##  3 A1      P1    L         3  25.0  15.5  144. Medium
##  4 A1      P2    H         4  25.7  15.0  146. High  
##  5 A1      P2    M         1  25.1  15.6  148. High  
##  6 A1      P2    L         2  24.9  15.7  145. Medium
##  7 A2      P3    H         3  21.0  19.7  146. Low   
##  8 A2      P3    M         4  21.4  19.3  145. Medium
##  9 A2      P3    L         1  20.3  20.2  142. Low   
## 10 A2      P4    H         2  20.5  19.6  144. Low   
## 11 A2      P4    M         3  21.5  19.1  144. Medium
## 12 A2      P4    L         4  22.2  18.3  145. Medium

Mutate

Window functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Categorical bins

data.1 %>% mutate(fTemp=cut(Temp, breaks=c(0,21,25,100),
                         labels=c('Low','Medium','High')))
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG fTemp 
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <fct> 
##  1 A1      P1    H         1  25.8  14.9  145. High  
##  2 A1      P1    M         2  24.3  16.2  142. Medium
##  3 A1      P1    L         3  25.0  15.5  144. Medium
##  4 A1      P2    H         4  25.7  15.0  146. High  
##  5 A1      P2    M         1  25.1  15.6  148. High  
##  6 A1      P2    L         2  24.9  15.7  145. Medium
##  7 A2      P3    H         3  21.0  19.7  146. Low   
##  8 A2      P3    M         4  21.4  19.3  145. Medium
##  9 A2      P3    L         1  20.3  20.2  142. Low   
## 10 A2      P4    H         2  20.5  19.6  144. Low   
## 11 A2      P4    M         3  21.5  19.1  144. Medium
## 12 A2      P4    L         4  22.2  18.3  145. Medium

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Bin Latitude into Northern, Central and Southern based on Latitude

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Bin Latitude into Northern, Central and Southern based on Latitude

Assuming even spread..

data.1 %>% mutate(Region = cut(LAT, breaks=3, 
                               labels=c('Northern', 'Central', 'Southern'))) 
## # A tibble: 12 x 8
##    Between Plot  Cond   Time  Temp   LAT  LONG Region  
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <fct>   
##  1 A1      P1    H         1  25.8  14.9  145. Northern
##  2 A1      P1    M         2  24.3  16.2  142. Northern
##  3 A1      P1    L         3  25.0  15.5  144. Northern
##  4 A1      P2    H         4  25.7  15.0  146. Northern
##  5 A1      P2    M         1  25.1  15.6  148. Northern
##  6 A1      P2    L         2  24.9  15.7  145. Northern
##  7 A2      P3    H         3  21.0  19.7  146. Southern
##  8 A2      P3    M         4  21.4  19.3  145. Southern
##  9 A2      P3    L         1  20.3  20.2  142. Southern
## 10 A2      P4    H         2  20.5  19.6  144. Southern
## 11 A2      P4    M         3  21.5  19.1  144. Southern
## 12 A2      P4    L         4  22.2  18.3  145. Central 

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Bin Latitude into Northern, Central and Southern based on Latitude

# Cut LAT into three bins labelled from the lowest to the highest latitude
# value, then store a copy of Region with its levels in order of first
# appearance.
# NOTE(review): the second mutate() overwrites the existing Between column
# rather than Region -- confirm this is intended.
data.1 %>% mutate(Region = cut(LAT, breaks=3, 
                               labels=c('Northern', 'Central', 'Southern'))) %>%
  mutate(Between = fct_inorder(Region))
## # A tibble: 12 x 8
##    Between  Plot  Cond   Time  Temp   LAT  LONG Region  
##    <fct>    <fct> <fct> <int> <dbl> <dbl> <dbl> <fct>   
##  1 Northern P1    H         1  25.8  14.9  145. Northern
##  2 Northern P1    M         2  24.3  16.2  142. Northern
##  3 Northern P1    L         3  25.0  15.5  144. Northern
##  4 Northern P2    H         4  25.7  15.0  146. Northern
##  5 Northern P2    M         1  25.1  15.6  148. Northern
##  6 Northern P2    L         2  24.9  15.7  145. Northern
##  7 Southern P3    H         3  21.0  19.7  146. Southern
##  8 Southern P3    M         4  21.4  19.3  145. Southern
##  9 Southern P3    L         1  20.3  20.2  142. Southern
## 10 Southern P4    H         2  20.5  19.6  144. Southern
## 11 Southern P4    M         3  21.5  19.1  144. Southern
## 12 Central  P4    L         4  22.2  18.3  145. Central 

Summarising (aggregating) data




Summarise

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% summarise(MeanTemp=mean(Temp), VarTemp=var(Temp), N=n())
## # A tibble: 1 x 3
##   MeanTemp VarTemp     N
##      <dbl>   <dbl> <int>
## 1     23.1    4.66    12
# Standard error of the mean: sample standard deviation divided by sqrt(n),
# where n is the length of x.
SE <- function(x) {
  n_obs <- length(x)
  sd(x) / sqrt(n_obs)
}
data.1 %>% summarise(MeanTemp=mean(Temp), VarTemp=var(Temp), 
          SEM=SE(Temp))
## # A tibble: 1 x 3
##   MeanTemp VarTemp   SEM
##      <dbl>   <dbl> <dbl>
## 1     23.1    4.66 0.623

Summarise

Across versions

data.1 %>% summarise(across(c(Temp,LAT), list(Mean=mean, Var=var)))
## # A tibble: 1 x 4
##   Temp_Mean Temp_Var LAT_Mean LAT_Var
##       <dbl>    <dbl>    <dbl>   <dbl>
## 1      23.1     4.66     17.4    4.37
data.1 %>% summarise(across(where(is.numeric), list(Mean=mean, Var=var)))
## # A tibble: 1 x 8
##   Time_Mean Time_Var Temp_Mean Temp_Var LAT_Mean LAT_Var LONG_Mean LONG_Var
##       <dbl>    <dbl>     <dbl>    <dbl>    <dbl>   <dbl>     <dbl>    <dbl>
## 1       2.5     1.36      23.1     4.66     17.4    4.37      145.     2.32
data.1 %>% summarize( across(where(is.numeric),  mean),
          across(where(is.factor),  length))
## # A tibble: 1 x 7
##    Time  Temp   LAT  LONG Between  Plot  Cond
##   <dbl> <dbl> <dbl> <dbl>   <int> <int> <int>
## 1   2.5  23.1  17.4  145.      12    12    12

Summarise

data.1 %>% count(Cond)
## # A tibble: 3 x 2
##   Cond      n
##   <fct> <int>
## 1 H         4
## 2 L         4
## 3 M         4
data.1 %>% count(Cond,between(Temp,20,30))
## # A tibble: 3 x 3
##   Cond  `between(Temp, 20, 30)`     n
##   <fct> <lgl>                   <int>
## 1 H     TRUE                        4
## 2 L     TRUE                        4
## 3 M     TRUE                        4

Grouping (=aggregating)




Grouping

data.1 %>% head(6)
## # A tibble: 6 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
## 5 A1      P2    M         1  25.1  15.6  148.
## 6 A1      P2    L         2  24.9  15.7  145.
data.1 %>% group_by(Between,Plot) %>%
    summarise(Mean=mean(Temp))
## # A tibble: 4 x 3
## # Groups:   Between [2]
##   Between Plot   Mean
##   <fct>   <fct> <dbl>
## 1 A1      P1     25.0
## 2 A1      P2     25.2
## 3 A2      P3     20.9
## 4 A2      P4     21.4

Grouping

data.1 %>% head(6)
## # A tibble: 6 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
## 5 A1      P2    M         1  25.1  15.6  148.
## 6 A1      P2    L         2  24.9  15.7  145.
data.1 %>% group_by(Between,Plot) %>%
    summarise(Mean=mean(Temp), Var=var(Temp), N=n(),First=first(Temp))
## # A tibble: 4 x 6
## # Groups:   Between [2]
##   Between Plot   Mean   Var     N First
##   <fct>   <fct> <dbl> <dbl> <int> <dbl>
## 1 A1      P1     25.0 0.528     3  25.8
## 2 A1      P2     25.2 0.202     3  25.7
## 3 A2      P3     20.9 0.280     3  21.0
## 4 A2      P4     21.4 0.724     3  20.5

Grouping

mutate vs summarise

data.1 %>% group_by(Between,Plot) %>%
    summarise(Mean=mean(Temp))
## # A tibble: 4 x 3
## # Groups:   Between [2]
##   Between Plot   Mean
##   <fct>   <fct> <dbl>
## 1 A1      P1     25.0
## 2 A1      P2     25.2
## 3 A2      P3     20.9
## 4 A2      P4     21.4
data.1 %>% group_by(Between,Plot) %>%
    mutate(Mean=mean(Temp))
## # A tibble: 12 x 8
## # Groups:   Between, Plot [4]
##    Between Plot  Cond   Time  Temp   LAT  LONG  Mean
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.  25.0
##  2 A1      P1    M         2  24.3  16.2  142.  25.0
##  3 A1      P1    L         3  25.0  15.5  144.  25.0
##  4 A1      P2    H         4  25.7  15.0  146.  25.2
##  5 A1      P2    M         1  25.1  15.6  148.  25.2
##  6 A1      P2    L         2  24.9  15.7  145.  25.2
##  7 A2      P3    H         3  21.0  19.7  146.  20.9
##  8 A2      P3    M         4  21.4  19.3  145.  20.9
##  9 A2      P3    L         1  20.3  20.2  142.  20.9
## 10 A2      P4    H         2  20.5  19.6  144.  21.4
## 11 A2      P4    M         3  21.5  19.1  144.  21.4
## 12 A2      P4    L         4  22.2  18.3  145.  21.4

Grouping

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% group_by(Between,Plot) %>%
  mutate(Mean=mean(Temp), cTemp=Temp-Mean)
## # A tibble: 12 x 9
## # Groups:   Between, Plot [4]
##    Between Plot  Cond   Time  Temp   LAT  LONG  Mean   cTemp
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>   <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.  25.0  0.753 
##  2 A1      P1    M         2  24.3  16.2  142.  25.0 -0.696 
##  3 A1      P1    L         3  25.0  15.5  144.  25.0 -0.0574
##  4 A1      P2    H         4  25.7  15.0  146.  25.2  0.509 
##  5 A1      P2    M         1  25.1  15.6  148.  25.2 -0.169 
##  6 A1      P2    L         2  24.9  15.7  145.  25.2 -0.340 
##  7 A2      P3    H         3  21.0  19.7  146.  20.9  0.0746
##  8 A2      P3    M         4  21.4  19.3  145.  20.9  0.488 
##  9 A2      P3    L         1  20.3  20.2  142.  20.9 -0.562 
## 10 A2      P4    H         2  20.5  19.6  144.  21.4 -0.905 
## 11 A2      P4    M         3  21.5  19.1  144.  21.4  0.122 
## 12 A2      P4    L         4  22.2  18.3  145.  21.4  0.783

Grouping

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% group_by(Between,Plot) %>%
  summarise(across(everything(), mean))
## # A tibble: 4 x 7
## # Groups:   Between [2]
##   Between Plot   Cond  Time  Temp   LAT  LONG
##   <fct>   <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A1      P1       NA  2     25.0  15.5  144.
## 2 A1      P2       NA  2.33  25.2  15.4  146.
## 3 A2      P3       NA  2.67  20.9  19.7  144.
## 4 A2      P4       NA  3     21.4  19.0  144.

Grouping

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(-Cond,-Time) %>% group_by(Between,Plot) %>%
    summarise_all(list(mean))
## # A tibble: 4 x 5
## # Groups:   Between [2]
##   Between Plot   Temp   LAT  LONG
##   <fct>   <fct> <dbl> <dbl> <dbl>
## 1 A1      P1     25.0  15.5  144.
## 2 A1      P2     25.2  15.4  146.
## 3 A2      P3     20.9  19.7  144.
## 4 A2      P4     21.4  19.0  144.

Grouping

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% group_by(Between,Plot) %>%
    summarise(across(c(Temp, LAT, LONG), list(Mean=mean, SE=SE)))
## # A tibble: 4 x 8
## # Groups:   Between [2]
##   Between Plot  Temp_Mean Temp_SE LAT_Mean LAT_SE LONG_Mean LONG_SE
##   <fct>   <fct>     <dbl>   <dbl>    <dbl>  <dbl>     <dbl>   <dbl>
## 1 A1      P1         25.0   0.419     15.5  0.351      144.   0.791
## 2 A1      P2         25.2   0.259     15.4  0.242      146.   0.855
## 3 A2      P3         20.9   0.305     19.7  0.262      144.   1.06 
## 4 A2      P4         21.4   0.491     19.0  0.379      144.   0.231

Your turn

Calculate for each year, the mean abundance of Pocillopora damicornis

tikus[1:10,c(1:3,76:77)]
##     Psammocora contigua Psammocora digitata Pocillopora damicornis time rep
## V1                    0                   0                     79   81   1
## V2                    0                   0                     51   81   2
## V3                    0                   0                     42   81   3
## V4                    0                   0                     15   81   4
## V5                    0                   0                      9   81   5
## V6                    0                   0                     72   81   6
## V7                    0                   0                      0   81   7
## V8                    0                   0                     16   81   8
## V9                    0                   0                      0   81   9
## V10                   0                   0                     16   81  10

NOTE to operate on columns whose names contain special characters (including spaces), you must use `` (backticks).

tikus %>% arrange(`Pocillopora damicornis`)

Your turn

Calculate for each year, the mean abundance of Pocillopora damicornis

tikus %>% group_by(time) %>%
    summarise(MeanAbundance=mean(`Pocillopora damicornis`))
## # A tibble: 6 x 2
##   time  MeanAbundance
##   <fct>         <dbl>
## 1 81             30  
## 2 83              0  
## 3 84              0  
## 4 85              0  
## 5 87              1.8
## 6 88              4

Your turn

Calculate for each year, the number of samples as well as the mean and variance of ozone

# Coerce nasa to a data frame before inspecting it.
nasa <- as.data.frame(nasa)
head(nasa)
##        lat   long month year cloudhigh cloudlow cloudmid ozone pressure surftemp temperature
## 1 36.20000 -113.8     1 1995      26.0      7.5     34.5   304      835    272.7       272.1
## 2 33.70435 -113.8     1 1995      20.0     11.5     32.5   304      940    279.5       282.2
## 3 31.20870 -113.8     1 1995      16.0     16.5     26.0   298      960    284.7       285.2
## 4 28.71304 -113.8     1 1995      13.0     20.5     14.5   276      990    289.3       290.7
## 5 26.21739 -113.8     1 1995       7.5     26.0     10.5   274     1000    292.2       292.7
## 6 23.72174 -113.8     1 1995       8.0     30.0      9.5   264     1000    294.1       293.6

Your turn

Calculate for each year, the number of samples as well as the mean and variance of ozone

nasa %>% group_by(year) %>%
    summarise(N=n(),Mean=mean(ozone), Var=var(ozone))
## # A tibble: 6 x 4
##    year     N  Mean   Var
##   <int> <int> <dbl> <dbl>
## 1  1995  6912  264.  258.
## 2  1996  6912  267.  326.
## 3  1997  6912  266.  327.
## 4  1998  6912  267.  507.
## 5  1999  6912  270.  368.
## 6  2000  6912  269.  353.

Subset columns




Selecting columns (select)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(Between,Plot,Cond,Time,Temp)
## # A tibble: 12 x 5
##    Between Plot  Cond   Time  Temp
##    <fct>   <fct> <fct> <int> <dbl>
##  1 A1      P1    H         1  25.8
##  2 A1      P1    M         2  24.3
##  3 A1      P1    L         3  25.0
##  4 A1      P2    H         4  25.7
##  5 A1      P2    M         1  25.1
##  6 A1      P2    L         2  24.9
##  7 A2      P3    H         3  21.0
##  8 A2      P3    M         4  21.4
##  9 A2      P3    L         1  20.3
## 10 A2      P4    H         2  20.5
## 11 A2      P4    M         3  21.5
## 12 A2      P4    L         4  22.2

Selecting columns (select)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(-LAT,-LONG)
## # A tibble: 12 x 5
##    Between Plot  Cond   Time  Temp
##    <fct>   <fct> <fct> <int> <dbl>
##  1 A1      P1    H         1  25.8
##  2 A1      P1    M         2  24.3
##  3 A1      P1    L         3  25.0
##  4 A1      P2    H         4  25.7
##  5 A1      P2    M         1  25.1
##  6 A1      P2    L         2  24.9
##  7 A2      P3    H         3  21.0
##  8 A2      P3    M         4  21.4
##  9 A2      P3    L         1  20.3
## 10 A2      P4    H         2  20.5
## 11 A2      P4    M         3  21.5
## 12 A2      P4    L         4  22.2

Selecting columns (select)

helper functions

  • contains()
  • ends_with()
  • starts_with()
  • matches()
  • everything()
  • across()

must evaluate to TRUE/FALSE

Selecting columns (select)

helper functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(contains('L'))
## # A tibble: 12 x 3
##    Plot    LAT  LONG
##    <fct> <dbl> <dbl>
##  1 P1     14.9  145.
##  2 P1     16.2  142.
##  3 P1     15.5  144.
##  4 P2     15.0  146.
##  5 P2     15.6  148.
##  6 P2     15.7  145.
##  7 P3     19.7  146.
##  8 P3     19.3  145.
##  9 P3     20.2  142.
## 10 P4     19.6  144.
## 11 P4     19.1  144.
## 12 P4     18.3  145.

Selecting columns (select)

helper functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(starts_with('L'))
## # A tibble: 12 x 2
##      LAT  LONG
##    <dbl> <dbl>
##  1  14.9  145.
##  2  16.2  142.
##  3  15.5  144.
##  4  15.0  146.
##  5  15.6  148.
##  6  15.7  145.
##  7  19.7  146.
##  8  19.3  145.
##  9  20.2  142.
## 10  19.6  144.
## 11  19.1  144.
## 12  18.3  145.

Selecting columns (select)

helper functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(ends_with('t'))
## # A tibble: 12 x 2
##    Plot    LAT
##    <fct> <dbl>
##  1 P1     14.9
##  2 P1     16.2
##  3 P1     15.5
##  4 P2     15.0
##  5 P2     15.6
##  6 P2     15.7
##  7 P3     19.7
##  8 P3     19.3
##  9 P3     20.2
## 10 P4     19.6
## 11 P4     19.1
## 12 P4     18.3

Selecting columns (select)

helper functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(matches('^T[a-z]m.'))
## # A tibble: 12 x 2
##     Time  Temp
##    <int> <dbl>
##  1     1  25.8
##  2     2  24.3
##  3     3  25.0
##  4     4  25.7
##  5     1  25.1
##  6     2  24.9
##  7     3  21.0
##  8     4  21.4
##  9     1  20.3
## 10     2  20.5
## 11     3  21.5
## 12     4  22.2

Regular expressions (regexp)

Selecting columns (select)

helper functions

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% select(Between:Temp)
## # A tibble: 12 x 5
##    Between Plot  Cond   Time  Temp
##    <fct>   <fct> <fct> <int> <dbl>
##  1 A1      P1    H         1  25.8
##  2 A1      P1    M         2  24.3
##  3 A1      P1    L         3  25.0
##  4 A1      P2    H         4  25.7
##  5 A1      P2    M         1  25.1
##  6 A1      P2    L         2  24.9
##  7 A2      P3    H         3  21.0
##  8 A2      P3    M         4  21.4
##  9 A2      P3    L         1  20.3
## 10 A2      P4    H         2  20.5
## 11 A2      P4    M         3  21.5
## 12 A2      P4    L         4  22.2

Your turn

nasa %>% head()
##        lat   long month year cloudhigh cloudlow cloudmid ozone pressure surftemp temperature
## 1 36.20000 -113.8     1 1995      26.0      7.5     34.5   304      835    272.7       272.1
## 2 33.70435 -113.8     1 1995      20.0     11.5     32.5   304      940    279.5       282.2
## 3 31.20870 -113.8     1 1995      16.0     16.5     26.0   298      960    284.7       285.2
## 4 28.71304 -113.8     1 1995      13.0     20.5     14.5   276      990    289.3       290.7
## 5 26.21739 -113.8     1 1995       7.5     26.0     10.5   274     1000    292.2       292.7
## 6 23.72174 -113.8     1 1995       8.0     30.0      9.5   264     1000    294.1       293.6

Select lat, long, and cloud.. columns

Your turn

nasa %>% head()
##        lat   long month year cloudhigh cloudlow cloudmid ozone pressure surftemp temperature
## 1 36.20000 -113.8     1 1995      26.0      7.5     34.5   304      835    272.7       272.1
## 2 33.70435 -113.8     1 1995      20.0     11.5     32.5   304      940    279.5       282.2
## 3 31.20870 -113.8     1 1995      16.0     16.5     26.0   298      960    284.7       285.2
## 4 28.71304 -113.8     1 1995      13.0     20.5     14.5   276      990    289.3       290.7
## 5 26.21739 -113.8     1 1995       7.5     26.0     10.5   274     1000    292.2       292.7
## 6 23.72174 -113.8     1 1995       8.0     30.0      9.5   264     1000    294.1       293.6
nasa %>% select(lat, long, starts_with("cloud")) %>% head
##        lat   long cloudhigh cloudlow cloudmid
## 1 36.20000 -113.8      26.0      7.5     34.5
## 2 33.70435 -113.8      20.0     11.5     32.5
## 3 31.20870 -113.8      16.0     16.5     26.0
## 4 28.71304 -113.8      13.0     20.5     14.5
## 5 26.21739 -113.8       7.5     26.0     10.5
## 6 23.72174 -113.8       8.0     30.0      9.5

Your turn

tikus[1:10,c(1:3,76:77)]
##     Psammocora contigua Psammocora digitata Pocillopora damicornis time rep
## V1                    0                   0                     79   81   1
## V2                    0                   0                     51   81   2
## V3                    0                   0                     42   81   3
## V4                    0                   0                     15   81   4
## V5                    0                   0                      9   81   5
## V6                    0                   0                     72   81   6
## V7                    0                   0                      0   81   7
## V8                    0                   0                     16   81   8
## V9                    0                   0                      0   81   9
## V10                   0                   0                     16   81  10

Select rep, time and only species whose names DON'T contain 'pora'

Your turn

Select rep, time and only species whose names DON'T contain 'pora'

# Drop every column whose name contains 'pora'; rep and time are kept because
# their names do not match.
tikus %>% dplyr::select(-contains('pora'))
## OR if we wanted to alter the order...
tikus %>% dplyr::select(rep, time, everything(),-contains('pora'))

Select awkward names

dplyr::select(tikus, `Pocillopora damicornis`)
##     Pocillopora damicornis
## V1                      79
## V2                      51
## V3                      42
## V4                      15
## V5                       9
## V6                      72
## V7                       0
## V8                      16
## V9                       0
## V10                     16
## V11                      0
## V12                      0
## V13                      0
## V14                      0
## V15                      0
## V16                      0
## V17                      0
## V18                      0
## V19                      0
## V20                      0
## V21                      0
## V22                      0
## V23                      0
## V24                      0
## V25                      0
## V26                      0
## V27                      0
## V28                      0
## V29                      0
## V30                      0
## V31                      0
## V32                      0
## V33                      0
## V34                      0
## V35                      0
## V36                      0
## V37                      0
## V38                      0
## V39                      0
## V40                      0
## V41                     18
## V42                      0
## V43                      0
## V44                      0
## V45                      0
## V46                      0
## V47                      0
## V48                      0
## V49                      0
## V50                      0
## V51                      0
## V52                      0
## V53                      0
## V54                      0
## V55                      0
## V56                      0
## V57                     10
## V58                      0
## V59                     30
## V60                      0

Selecting a single variable

data.1 %>% pull(Temp)
##  [1] 25.77507 24.32564 24.96428 25.73127 25.05280 24.88189 20.97769 21.39090 20.34081
## [10] 20.48899 21.51637 22.17791

Re-naming columns (vectors)

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% rename(Condition=Cond, Temperature=Temp)
## # A tibble: 12 x 7
##    Between Plot  Condition  Time Temperature   LAT  LONG
##    <fct>   <fct> <fct>     <int>       <dbl> <dbl> <dbl>
##  1 A1      P1    H             1        25.8  14.9  145.
##  2 A1      P1    M             2        24.3  16.2  142.
##  3 A1      P1    L             3        25.0  15.5  144.
##  4 A1      P2    H             4        25.7  15.0  146.
##  5 A1      P2    M             1        25.1  15.6  148.
##  6 A1      P2    L             2        24.9  15.7  145.
##  7 A2      P3    H             3        21.0  19.7  146.
##  8 A2      P3    M             4        21.4  19.3  145.
##  9 A2      P3    L             1        20.3  20.2  142.
## 10 A2      P4    H             2        20.5  19.6  144.
## 11 A2      P4    M             3        21.5  19.1  144.
## 12 A2      P4    L             4        22.2  18.3  145.

Filtering




Filtering

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% filter(Cond=='H')
## # A tibble: 4 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P2    H         4  25.7  15.0  146.
## 3 A2      P3    H         3  21.0  19.7  146.
## 4 A2      P4    H         2  20.5  19.6  144.
data.1 %>% filter(Cond %in% c('H','M'))
## # A tibble: 8 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P2    H         4  25.7  15.0  146.
## 4 A1      P2    M         1  25.1  15.6  148.
## 5 A2      P3    H         3  21.0  19.7  146.
## 6 A2      P3    M         4  21.4  19.3  145.
## 7 A2      P4    H         2  20.5  19.6  144.
## 8 A2      P4    M         3  21.5  19.1  144.

Filtering

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% filter(Cond=='H' & Temp<25)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A2      P3    H         3  21.0  19.7  146.
## 2 A2      P4    H         2  20.5  19.6  144.
data.1 %>% filter(Cond=='H' | Temp<25)
## # A tibble: 11 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.
##  2 A1      P1    M         2  24.3  16.2  142.
##  3 A1      P1    L         3  25.0  15.5  144.
##  4 A1      P2    H         4  25.7  15.0  146.
##  5 A1      P2    L         2  24.9  15.7  145.
##  6 A2      P3    H         3  21.0  19.7  146.
##  7 A2      P3    M         4  21.4  19.3  145.
##  8 A2      P3    L         1  20.3  20.2  142.
##  9 A2      P4    H         2  20.5  19.6  144.
## 10 A2      P4    M         3  21.5  19.1  144.
## 11 A2      P4    L         4  22.2  18.3  145.

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Keep only those rows with Temp less than 20 and either LAT greater than 20 or LONG less than 145

Your turn

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.

Keep only those rows with Temp less than 20 and either LAT greater than 20 or LONG less than 145

data.1 %>% filter(Temp<20 & (LAT>20 |  LONG <145))
## # A tibble: 0 x 7
## # … with 7 variables: Between <fct>, Plot <fct>, Cond <fct>, Time <int>, Temp <dbl>,
## #   LAT <dbl>, LONG <dbl>

Your turn

glimpse(nasa)
## Rows: 41,472
## Columns: 11
## $ lat         <dbl> 36.200000, 33.704348, 31.208696, 28.713043, 26.217391, 23.721739, 2…
## $ long        <dbl> -113.8000, -113.8000, -113.8000, -113.8000, -113.8000, -113.8000, -…
## $ month       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ year        <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1…
## $ cloudhigh   <dbl> 26.0, 20.0, 16.0, 13.0, 7.5, 8.0, 14.5, 19.5, 22.5, 21.0, 19.0, 16.…
## $ cloudlow    <dbl> 7.5, 11.5, 16.5, 20.5, 26.0, 30.0, 29.5, 26.5, 27.5, 26.0, 28.5, 28…
## $ cloudmid    <dbl> 34.5, 32.5, 26.0, 14.5, 10.5, 9.5, 11.0, 17.5, 18.5, 16.5, 12.5, 13…
## $ ozone       <dbl> 304, 304, 298, 276, 274, 264, 258, 252, 250, 250, 248, 248, 250, 24…
## $ pressure    <dbl> 835, 940, 960, 990, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,…
## $ surftemp    <dbl> 272.7, 279.5, 284.7, 289.3, 292.2, 294.1, 295.0, 298.3, 300.1, 300.…
## $ temperature <dbl> 272.1, 282.2, 285.2, 290.7, 292.7, 293.6, 294.6, 296.9, 297.8, 298.…

Filter to the five largest ozone values for the second month of the last year

Your turn

Filter to the five largest ozone values for the second month of the last year

nasa %>% filter(year==max(year) & month==2) %>% 
    arrange(-ozone) %>% head(5)
nasa %>% filter(year==max(year) & month==2) %>%
    arrange(-ozone) %>% slice(1:5)
##OR
nasa %>% filter(year==max(year) & month==2 ) %>%
    top_n(5, ozone)

Your turn

Filter to all ozone values between 320 and 325 in the first month of the last year

glimpse(nasa)
## Rows: 41,472
## Columns: 11
## $ lat         <dbl> 36.200000, 33.704348, 31.208696, 28.713043, 26.217391, 23.721739, 2…
## $ long        <dbl> -113.8000, -113.8000, -113.8000, -113.8000, -113.8000, -113.8000, -…
## $ month       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ year        <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1…
## $ cloudhigh   <dbl> 26.0, 20.0, 16.0, 13.0, 7.5, 8.0, 14.5, 19.5, 22.5, 21.0, 19.0, 16.…
## $ cloudlow    <dbl> 7.5, 11.5, 16.5, 20.5, 26.0, 30.0, 29.5, 26.5, 27.5, 26.0, 28.5, 28…
## $ cloudmid    <dbl> 34.5, 32.5, 26.0, 14.5, 10.5, 9.5, 11.0, 17.5, 18.5, 16.5, 12.5, 13…
## $ ozone       <dbl> 304, 304, 298, 276, 274, 264, 258, 252, 250, 250, 248, 248, 250, 24…
## $ pressure    <dbl> 835, 940, 960, 990, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,…
## $ surftemp    <dbl> 272.7, 279.5, 284.7, 289.3, 292.2, 294.1, 295.0, 298.3, 300.1, 300.…
## $ temperature <dbl> 272.1, 282.2, 285.2, 290.7, 292.7, 293.6, 294.6, 296.9, 297.8, 298.…

Your turn

Filter to all ozone values between 320 and 325 in the first month of the last year

# Keep ozone values from 320 to 325 (inclusive) in the earliest month of the
# latest year. min()/max() are used so the result does not depend on the row
# order of nasa.
nasa %>% filter(ozone >= 320 & ozone <= 325, month == min(month),
       year == max(year))
##OR
# between() is inclusive at both ends, matching the >= / <= form above.
nasa %>% filter(between(ozone, 320, 325), month == min(month),
       year == max(year))

Slicing

Filtering by row number

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% slice(1:4)
## # A tibble: 4 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
data.1 %>% slice(c(1:4,7))
## # A tibble: 5 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
## 4 A1      P2    H         4  25.7  15.0  146.
## 5 A2      P3    H         3  21.0  19.7  146.

Sampling

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% sample_n(10, replace=TRUE)
## # A tibble: 10 x 7
##    Between Plot  Cond   Time  Temp   LAT  LONG
##    <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
##  1 A1      P1    H         1  25.8  14.9  145.
##  2 A1      P1    L         3  25.0  15.5  144.
##  3 A1      P1    L         3  25.0  15.5  144.
##  4 A2      P3    M         4  21.4  19.3  145.
##  5 A1      P2    L         2  24.9  15.7  145.
##  6 A2      P3    H         3  21.0  19.7  146.
##  7 A2      P4    L         4  22.2  18.3  145.
##  8 A1      P2    L         2  24.9  15.7  145.
##  9 A2      P3    M         4  21.4  19.3  145.
## 10 A2      P3    H         3  21.0  19.7  146.

Sampling

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
data.1 %>% sample_frac(0.5, replace=TRUE)
## # A tibble: 6 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A2      P4    M         3  21.5  19.1  144.
## 2 A1      P1    H         1  25.8  14.9  145.
## 3 A1      P2    H         4  25.7  15.0  146.
## 4 A2      P3    M         4  21.4  19.3  145.
## 5 A2      P3    L         1  20.3  20.2  142.
## 6 A2      P3    L         1  20.3  20.2  142.

Effects of filtering

data.1 %>% head(2)
## # A tibble: 2 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
#examine the levels of the Cond factor
levels(data.1$Cond)
## [1] "H" "L" "M"

Effects of filtering

#subset the dataset to just Plot P1
#(filter() only removes rows; the factor levels are inspected below)
data.3 <- data.1 %>% filter(Plot=='P1')
#examine subset data
data.3
## # A tibble: 3 x 7
##   Between Plot  Cond   Time  Temp   LAT  LONG
##   <fct>   <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 A1      P1    H         1  25.8  14.9  145.
## 2 A1      P1    M         2  24.3  16.2  142.
## 3 A1      P1    L         3  25.0  15.5  144.
#examine the levels of the Cond factor
levels(data.3$Cond)
## [1] "H" "L" "M"
levels(data.3$Plot)
## [1] "P1" "P2" "P3" "P4"
levels(data.3$Between)
## [1] "A1" "A2"

Effects of filtering

Correction - all factors

#subset the dataset to just Plot P1
data.3 <-  data.1 %>% filter(Plot=='P1')
#drop the unused factor levels from all factors
data.3 <- data.3 %>% droplevels()
#examine the levels of each factor
levels(data.3$Cond)
## [1] "H" "L" "M"
levels(data.3$Plot)
## [1] "P1"
levels(data.3$Between)
## [1] "A1"

Effects of filtering

Correction - single factor

#subset the dataset to just Plot P1
data.3 <- data.1 %>% filter(Plot=='P1')
#drop the unused factor levels from Plot only (re-creating the factor
#keeps just the levels that are still present)
data.3 <- data.3 %>% mutate(Plot=factor(Plot))
#examine the levels of each factor
levels(data.3$Cond)
## [1] "H" "L" "M"
levels(data.3$Plot)
## [1] "P1"
levels(data.3$Between)
## [1] "A1" "A2"

Reshaping data




Reshaping data frames

Wide data

  Between Plot Time.0 Time.1 Time.2
R1 A1 P1 8 14 14
R2 A1 P2 10 12 11
R3 A2 P3 7 11 8
R4 A2 P4 11 9 2

Wide to long (melt)

#stack the Time.* columns into Time/Count pairs, selecting them as a range
data.w %>%
  pivot_longer(Time.0:Time.2, names_to = "Time", values_to = "Count")
## OR
#same result, selecting everything except the identifier columns
data.w %>%
  pivot_longer(c(-Between, -Plot), names_to = "Time", values_to = "Count")
## # A tibble: 12 x 4
##    Between Plot  Time   Count
##    <fct>   <fct> <chr>  <int>
##  1 A1      P1    Time.0     8
##  2 A1      P1    Time.1    14
##  3 A1      P1    Time.2    14
##  4 A1      P2    Time.0    10
##  5 A1      P2    Time.1    12
##  6 A1      P2    Time.2    11
##  7 A2      P3    Time.0     7
##  8 A2      P3    Time.1    11
##  9 A2      P3    Time.2     8
## 10 A2      P4    Time.0    11
## 11 A2      P4    Time.1     9
## 12 A2      P4    Time.2     2
## # A tibble: 12 x 4
##    Between Plot  Time   Count
##    <fct>   <fct> <chr>  <int>
##  1 A1      P1    Time.0     8
##  2 A1      P1    Time.1    14
##  3 A1      P1    Time.2    14
##  4 A1      P2    Time.0    10
##  5 A1      P2    Time.1    12
##  6 A1      P2    Time.2    11
##  7 A2      P3    Time.0     7
##  8 A2      P3    Time.1    11
##  9 A2      P3    Time.2     8
## 10 A2      P4    Time.0    11
## 11 A2      P4    Time.1     9
## 12 A2      P4    Time.2     2

Reshaping data frames

Wide data

  Between Plot Time.0 Time.1 Time.2
R1 A1 P1 8 14 14
R2 A1 P2 10 12 11
R3 A2 P3 7 11 8
R4 A2 P4 11 9 2

Wide to long (melt)

## OR
#select the Time.* columns by prefix and strip that prefix from the
#values written to the new Time column
data.w %>%
  pivot_longer(
    starts_with("Time"),
    names_prefix = "Time.",
    names_to = "Time",
    values_to = "Count"
  )
## # A tibble: 12 x 4
##    Between Plot  Time  Count
##    <fct>   <fct> <chr> <int>
##  1 A1      P1    0         8
##  2 A1      P1    1        14
##  3 A1      P1    2        14
##  4 A1      P2    0        10
##  5 A1      P2    1        12
##  6 A1      P2    2        11
##  7 A2      P3    0         7
##  8 A2      P3    1        11
##  9 A2      P3    2         8
## 10 A2      P4    0        11
## 11 A2      P4    1         9
## 12 A2      P4    2         2

Reshaping data frames

Long data

Resp1 Resp2 Between Plot Subplot Within
8 17 A1 P1 S1 B1
10 18 A1 P1 S1 B2
7 17 A1 P1 S2 B1
11 21 A1 P1 S2 B2
14 19 A2 P2 S3 B1
12 13 A2 P2 S3 B2
11 24 A2 P2 S4 B1
9 18 A2 P2 S4 B2
14 25 A3 P3 S5 B1
11 18 A3 P3 S5 B2
8 27 A3 P3 S6 B1
2 22 A3 P3 S6 B2
8 17 A1 P4 S7 B1
10 22 A1 P4 S7 B2
7 16 A1 P4 S8 B1
12 13 A1 P4 S8 B2
11 23 A2 P5 S9 B1
12 19 A2 P5 S9 B2
12 23 A2 P5 S10 B1
10 21 A2 P5 S10 B2
3 17 A3 P6 S11 B1
11 16 A3 P6 S11 B2
13 26 A3 P6 S12 B1
7 28 A3 P6 S12 B2

Reshaping data frames

data %>% head(2)
##   Resp1 Resp2 Between Plot Subplot Within
## 1     8    17      A1   P1      S1     B1
## 2    10    18      A1   P1      S1     B2

Widen (cast)

Widen Resp1 for repeated measures (Within)

#drop Resp2, then spread Resp1 into one column per level of Within
data %>%
  select(-Resp2) %>%
  pivot_wider(names_from = Within, values_from = Resp1)
## # A tibble: 12 x 5
##    Between Plot  Subplot    B1    B2
##    <fct>   <fct> <fct>   <int> <int>
##  1 A1      P1    S1          8    10
##  2 A1      P1    S2          7    11
##  3 A2      P2    S3         14    12
##  4 A2      P2    S4         11     9
##  5 A3      P3    S5         14    11
##  6 A3      P3    S6          8     2
##  7 A1      P4    S7          8    10
##  8 A1      P4    S8          7    12
##  9 A2      P5    S9         11    12
## 10 A2      P5    S10        12    10
## 11 A3      P6    S11         3    11
## 12 A3      P6    S12        13     7

Reshaping data frames

Widen Resp1 and Resp2 for repeated measures (Within)

data %>% head(2)
##   Resp1 Resp2 Between Plot Subplot Within
## 1     8    17      A1   P1      S1     B1
## 2    10    18      A1   P1      S1     B2
#spread both responses at once; the new columns are named
#<response>_<Within level>
data %>%
  pivot_wider(
    names_from = Within,
    values_from = c(Resp1, Resp2)
  )
## # A tibble: 12 x 7
##    Between Plot  Subplot Resp1_B1 Resp1_B2 Resp2_B1 Resp2_B2
##    <fct>   <fct> <fct>      <int>    <int>    <int>    <int>
##  1 A1      P1    S1             8       10       17       18
##  2 A1      P1    S2             7       11       17       21
##  3 A2      P2    S3            14       12       19       13
##  4 A2      P2    S4            11        9       24       18
##  5 A3      P3    S5            14       11       25       18
##  6 A3      P3    S6             8        2       27       22
##  7 A1      P4    S7             8       10       17       22
##  8 A1      P4    S8             7       12       16       13
##  9 A2      P5    S9            11       12       23       19
## 10 A2      P5    S10           12       10       23       21
## 11 A3      P6    S11            3       11       17       16
## 12 A3      P6    S12           13        7       26       28

Combining data




Merging data frames

Bio data (missing S3)

  Resp1 Resp2 Between Plot Subplot
1 8 18 A1 P1 S1
2 10 21 A1 P1 S2
4 11 23 A1 P2 S4
5 14 22 A2 P3 S5
6 12 24 A2 P3 S6
7 11 23 A2 P4 S7
8 9 20 A2 P4 S8
9 14 11 A3 P5 S9
10 11 22 A3 P5 S10
11 8 24 A3 P6 S11
12 2 16 A3 P6 S12

Physio-chemical data (missing S7)

  Chem1 Chem2 Between Plot Subplot
1 1.453 0.8858 A1 P1 S1
2 3.266 0.18 A1 P1 S2
3 1.179 5.078 A1 P2 S3
4 13.4 1.576 A1 P2 S4
5 3.779 1.622 A2 P3 S5
6 1.197 4.237 A2 P3 S6
8 5.688 2.986 A2 P4 S8
9 4.835 4.133 A3 P5 S9
10 2.003 3.604 A3 P5 S10
11 12.33 1.776 A3 P6 S11
12 4.014 0.2255 A3 P6 S12

Merging data frames

Merge bio and chem data (only keep full matches - an inner join)

#inner join: keep only the rows whose Between/Plot/Subplot key is present
#in both tables. The keys are named explicitly rather than relying on the
#implicit natural join over whatever columns the tables happen to share.
data.bio %>% inner_join(data.chem, by=c("Between", "Plot", "Subplot"))
##    Resp1 Resp2 Between Plot Subplot     Chem1     Chem2
## 1      8    18      A1   P1      S1  1.452878 0.8858208
## 2     10    21      A1   P1      S2  3.266253 0.1800177
## 3     11    23      A1   P2      S4 13.400350 1.5762780
## 4     14    22      A2   P3      S5  3.779183 1.6222430
## 5     12    24      A2   P3      S6  1.196657 4.2369184
## 6      9    20      A2   P4      S8  5.687807 2.9859003
## 7     14    11      A3   P5      S9  4.834518 4.1328919
## 8     11    22      A3   P5     S10  2.002931 3.6043314
## 9      8    24      A3   P6     S11 12.326867 1.7763576
## 10     2    16      A3   P6     S12  4.014221 0.2255188
  • S3 and S7 absent

Merging data frames

Merge bio and chem data (keep all data - outer join)

#full (outer) join: keep every row from both tables, filling the columns of
#the table that has no match with NA. The keys are named explicitly rather
#than relying on the implicit natural join over all shared columns.
data.bio %>% full_join(data.chem, by=c("Between", "Plot", "Subplot"))
##    Resp1 Resp2 Between Plot Subplot     Chem1     Chem2
## 1      8    18      A1   P1      S1  1.452878 0.8858208
## 2     10    21      A1   P1      S2  3.266253 0.1800177
## 3     11    23      A1   P2      S4 13.400350 1.5762780
## 4     14    22      A2   P3      S5  3.779183 1.6222430
## 5     12    24      A2   P3      S6  1.196657 4.2369184
## 6     11    23      A2   P4      S7        NA        NA
## 7      9    20      A2   P4      S8  5.687807 2.9859003
## 8     14    11      A3   P5      S9  4.834518 4.1328919
## 9     11    22      A3   P5     S10  2.002931 3.6043314
## 10     8    24      A3   P6     S11 12.326867 1.7763576
## 11     2    16      A3   P6     S12  4.014221 0.2255188
## 12    NA    NA      A1   P2      S3  1.178652 5.0780682
  • note the order of Subplot

Merging data frames

Merge bio and chem data (keep all BIO rows, adding CHEM data where it matches - left join)

#left join: keep every row of data.bio and attach the matching data.chem
#values (NA where data.chem has no match). The keys are named explicitly
#rather than relying on the implicit natural join over all shared columns.
data.bio %>% left_join(data.chem, by=c("Between", "Plot", "Subplot"))
##    Resp1 Resp2 Between Plot Subplot     Chem1     Chem2
## 1      8    18      A1   P1      S1  1.452878 0.8858208
## 2     10    21      A1   P1      S2  3.266253 0.1800177
## 3     11    23      A1   P2      S4 13.400350 1.5762780
## 4     14    22      A2   P3      S5  3.779183 1.6222430
## 5     12    24      A2   P3      S6  1.196657 4.2369184
## 6     11    23      A2   P4      S7        NA        NA
## 7      9    20      A2   P4      S8  5.687807 2.9859003
## 8     14    11      A3   P5      S9  4.834518 4.1328919
## 9     11    22      A3   P5     S10  2.002931 3.6043314
## 10     8    24      A3   P6     S11 12.326867 1.7763576
## 11     2    16      A3   P6     S12  4.014221 0.2255188

Merging data frames

Merge bio and chem data (keep all CHEM rows, adding BIO data where it matches - right join)

#right join: keep every row of data.chem and attach the matching data.bio
#values (NA where data.bio has no match). The keys are named explicitly
#rather than relying on the implicit natural join over all shared columns.
data.bio %>% right_join(data.chem, by=c("Between", "Plot", "Subplot"))
##    Resp1 Resp2 Between Plot Subplot     Chem1     Chem2
## 1      8    18      A1   P1      S1  1.452878 0.8858208
## 2     10    21      A1   P1      S2  3.266253 0.1800177
## 3     11    23      A1   P2      S4 13.400350 1.5762780
## 4     14    22      A2   P3      S5  3.779183 1.6222430
## 5     12    24      A2   P3      S6  1.196657 4.2369184
## 6      9    20      A2   P4      S8  5.687807 2.9859003
## 7     14    11      A3   P5      S9  4.834518 4.1328919
## 8     11    22      A3   P5     S10  2.002931 3.6043314
## 9      8    24      A3   P6     S11 12.326867 1.7763576
## 10     2    16      A3   P6     S12  4.014221 0.2255188
## 11    NA    NA      A1   P2      S3  1.178652 5.0780682

VLOOKUP

VLOOKUP

Biological data set (data.bio)

##    Resp1 Resp2 Between Plot Subplot
## 1      8    18      A1   P1      S1
## 2     10    21      A1   P1      S2
## 4     11    23      A1   P2      S4
## 5     14    22      A2   P3      S5
## 6     12    24      A2   P3      S6
## 7     11    23      A2   P4      S7
## 8      9    20      A2   P4      S8
## 9     14    11      A3   P5      S9
## 10    11    22      A3   P5     S10
## 11     8    24      A3   P6     S11
## 12     2    16      A3   P6     S12

Geographical data set (lookup table) (data.geo)

##   Plot     LAT     LONG
## 1   P1 17.9605 145.4326
## 2   P2 17.5210 146.1983
## 3   P3 17.0011 146.3839
## 4   P4 18.2350 146.7934
## 5   P5 18.9840 146.0345
## 6   P6 20.1154 146.4672

VLOOKUP

Incorporate (merge) the lat/longs into the bio data

#look up each row's Plot in data.geo and attach that plot's LAT/LONG
data.bio %>%
  left_join(data.geo, by = "Plot")
##    Resp1 Resp2 Between Plot Subplot     LAT     LONG
## 1      8    18      A1   P1      S1 17.9605 145.4326
## 2     10    21      A1   P1      S2 17.9605 145.4326
## 3     11    23      A1   P2      S4 17.5210 146.1983
## 4     14    22      A2   P3      S5 17.0011 146.3839
## 5     12    24      A2   P3      S6 17.0011 146.3839
## 6     11    23      A2   P4      S7 18.2350 146.7934
## 7      9    20      A2   P4      S8 18.2350 146.7934
## 8     14    11      A3   P5      S9 18.9840 146.0345
## 9     11    22      A3   P5     S10 18.9840 146.0345
## 10     8    24      A3   P6     S11 20.1154 146.4672
## 11     2    16      A3   P6     S12 20.1154 146.4672

Applied examples

Tikus Island coral data

##     Psammocora contigua Psammocora digitata time rep
## V1                    0                   0   81   1
## V2                    0                   0   81   2
## V3                    0                   0   81   3
## V4                    0                   0   81   4
## V5                    0                   0   81   5
## V6                    0                   0   81   6
## V7                    0                   0   81   7
## V8                    0                   0   81   8
## V9                    0                   0   81   9
## V10                   0                   0   81  10
## Rows: 60
## Columns: 77
## $ `Psammocora contigua`    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Psammocora digitata`    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Pocillopora damicornis` <int> 79, 51, 42, 15, 9, 72, 0, 16, 0, 1…
## $ `Pocillopora verrucosa`  <int> 32, 21, 35, 0, 0, 0, 41, 25, 38, 0…
## $ `Stylopora pistillata`   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Acropora bruegemanni`   <int> 0, 44, 0, 11, 9, 10, 0, 0, 0, 37, …
## $ `Acropora robusta`       <int> 0, 35, 40, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Acropora grandis`       <int> 0, 0, 0, 0, 0, 0, 60, 0, 0, 0, 0, …
## $ `Acropora intermedia`    <int> 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Acropora formosa`       <int> 75, 0, 15, 0, 125, 0, 0, 0, 10, 0,…
## $ `Acropora splendida`     <int> 0, 22, 0, 31, 0, 9, 16, 0, 0, 20, …
## $ `Acropera aspera`        <int> 17, 18, 9, 8, 23, 0, 17, 13, 16, 1…
## $ `Acropora hyacinthus`    <int> 141, 34, 55, 54, 0, 0, 0, 0, 0, 0,…
## $ `Acropora palifera`      <int> 32, 0, 44, 0, 17, 0, 0, 0, 0, 0, 0…
## $ `Acropora cytherea`      <int> 108, 33, 14, 122, 0, 0, 0, 8, 0, 0…
## $ `Acropora tenuis`        <int> 0, 25, 0, 0, 0, 22, 28, 0, 0, 0, 0…
## $ `Acropora pulchra`       <int> 0, 0, 15, 52, 62, 33, 0, 0, 24, 0,…
## $ `Acropora nasuta`        <int> 43, 21, 19, 0, 0, 0, 10, 0, 0, 0, …
## $ `Acropora humilis`       <int> 31, 25, 0, 19, 0, 0, 0, 0, 0, 0, 0…
## $ `Acropora diversa`       <int> 22, 19, 20, 13, 23, 14, 0, 12, 12,…
## $ `Acropora digitifera`    <int> 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Acropora divaricata`    <int> 0, 32, 55, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Acropora subglabra`     <int> 51, 0, 0, 44, 15, 0, 0, 25, 0, 0, …
## $ `Acropora cerealis`      <int> 0, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Acropora valida`        <int> 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, …
## $ `Acropora acuminata`     <int> 20, 0, 71, 0, 15, 0, 25, 25, 0, 0,…
## $ `Acropora elsevi`        <int> 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Acropora millepora`     <int> 17, 14, 0, 20, 0, 0, 0, 0, 0, 0, 0…
## $ `Montipora monasteriata` <int> 60, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Montipora tuberculosa`  <int> 0, 15, 15, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Montipora hispida`      <int> 0, 0, 0, 32, 40, 24, 0, 0, 0, 0, 0…
## $ `Montipora digitata`     <int> 0, 0, 0, 0, 0, 77, 84, 53, 71, 351…
## $ `Montipora foliosa`      <int> 0, 0, 0, 0, 50, 71, 62, 81, 24, 0,…
## $ `Montipora verrucosa`    <int> 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Fungia fungites`        <int> 0, 0, 18, 17, 0, 0, 0, 0, 0, 0, 0,…
## $ `Fungia paumotensis`     <int> 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 12,…
## $ `Fungia concina`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ `Fungia scutaria`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Halomitra limax`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Pavona varians`         <int> 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,…
## $ `Pavona venosa`          <int> 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Pavona cactus`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Coeloseris mayeri`      <int> 20, 0, 15, 0, 9, 19, 0, 0, 25, 0, …
## $ `Galaxea fascicularis`   <int> 51, 27, 31, 24, 0, 13, 0, 0, 0, 0,…
## $ `Symphyllia radians`     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lobophyllia corymbosa`  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ `Lobophyllia hemprichii` <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Porites cylindrica`     <int> 61, 24, 0, 20, 0, 0, 0, 0, 0, 0, 1…
## $ `Porites lichen`         <int> 0, 47, 49, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Porites lobata`         <int> 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Porites lutea`          <int> 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Porites nigrescens`     <int> 0, 0, 0, 21, 0, 9, 25, 0, 45, 26, …
## $ `Porites solida`         <int> 0, 0, 10, 0, 17, 0, 31, 41, 0, 0, …
## $ `Porites stephensoni`    <int> 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, …
## $ `Goniopora lobata`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Favia pallida`          <int> 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Favia speciosa`         <int> 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Favia stelligera`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Favia rotumana`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Favites abdita`         <int> 33, 41, 23, 27, 91, 63, 72, 48, 71…
## $ `Favites chinensis`      <int> 0, 44, 78, 61, 44, 0, 55, 30, 30, …
## $ `Goniastrea rectiformis` <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0…
## $ `Goniastrea pectinata`   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Goniastrea sp`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Dulophyllia crispa`     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9…
## $ `Platygyra daedalea`     <int> 0, 27, 55, 0, 71, 74, 55, 48, 0, 0…
## $ `Platygyra sinensis`     <int> 47, 27, 56, 26, 0, 0, 0, 0, 0, 0, …
## $ `Hydnopora rigida`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Leptastrea purpurea`    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Leptastrea pruinosa`    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Cyphastrea serailia`    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, …
## $ `Millepora platyphylla`  <int> 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Millepora dichotoma`    <int> 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Millepora intrincata`   <int> 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Heliopora coerulea`     <int> 461, 271, 221, 154, 0, 0, 0, 0, 0,…
## $ time                     <fct> 81, 81, 81, 81, 81, 81, 81, 81, 81…
## $ rep                      <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, …

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover (abundance is the length in cm of a 10m transect containing the species)
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 1. fix typo (rename) - backticks

#correct the misspelt column name (new name on the left, old name on the
#right; backticks are required because the names contain a space)
tikus %>%
  rename(`Acropora aspera` = `Acropera aspera`)

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 2. melt data (gather)

#fix the 'Acropera' typo, then stack the species columns into
#Species/Abundance pairs
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance')

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 3. Calculate Cover (mutate) (Abundance/10)

#fix the typo, stack the species columns, then convert abundance to cover
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        #Abundance is cm along a 10m (1000cm) transect, so /10 gives percent
        mutate(Cover=Abundance/10)

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 4. Split species into Genera and Species (separate)

#fix the typo, stack the species columns, convert to cover, then split the
#two-word species names into Genera and Species columns
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species'))

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 5. Subset just ‘Acropora’ (filter)

#fix the typo, stack the species columns, convert to cover, split the names,
#then keep only the Acropora rows
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species')) %>%
                filter(Genera=='Acropora')

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 6. Sum over all Species (group_by and summarise)

#total Acropora cover for each time/rep combination (summed over species)
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species')) %>%
                filter(Genera=='Acropora') %>%
                    group_by(time,rep) %>%
                        summarise(SumCover=sum(Cover))

Tikus Island coral data

Explore/Process data

  • Convert abundance to cover
  • Mean cover of total Acropora per year
  • NOTE there is a typo ‘Acropera’

Step 7. Summarise per year

#total Acropora cover for each time/rep combination, then the mean and
#variance of those totals within each year
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species')) %>%
                filter(Genera=='Acropora') %>%
                    group_by(time,rep) %>%
                        summarise(SumCover=sum(Cover)) %>%
                            group_by(time) %>%
                                summarise(Mean=mean(SumCover),
                                          Var=var(SumCover))
## # A tibble: 6 x 3
##   time   Mean   Var
##   <fct> <dbl> <dbl>
## 1 81    25.6  383. 
## 2 83     0      0  
## 3 84     0      0  
## 4 85     2.43  14.2
## 5 87     8.01  68.5
## 6 88     8.55 106.

Tikus Island coral data

#complete pipeline: mean and variance of total Acropora cover per year
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species')) %>%
                filter(Genera=='Acropora') %>%
                    group_by(time,rep) %>%
                        summarise(SumCover=sum(Cover)) %>%
                            group_by(time) %>%
                                summarise(Mean=mean(SumCover),
                                          Var=var(SumCover))
## # A tibble: 6 x 3
##   time   Mean   Var
##   <fct> <dbl> <dbl>
## 1 81    25.6  383. 
## 2 83     0      0  
## 3 84     0      0  
## 4 85     2.43  14.2
## 5 87     8.01  68.5
## 6 88     8.55 106.

Tikus Island coral data

Can you modify so that we get the means and var for each Genera per year?

#no filter this time: Genera is added to both groupings, so the totals and
#their mean/variance are calculated for every genus in every year
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species')) %>%
                group_by(time,rep,Genera) %>%
                    summarise(SumCover=sum(Cover)) %>%
                        group_by(time,Genera) %>%
                            summarise(Mean=mean(SumCover),
                                      Var=var(SumCover))
## # A tibble: 144 x 4
## # Groups:   time [6]
##    time  Genera       Mean    Var
##    <fct> <chr>       <dbl>  <dbl>
##  1 81    Acropora    25.6  383.  
##  2 81    Coeloseris   0.88   1.02
##  3 81    Cyphastrea   0      0   
##  4 81    Dulophyllia  0      0   
##  5 81    Favia        0.6    1.16
##  6 81    Favites      8.22  14.9 
##  7 81    Fungia       0.68   1.38
##  8 81    Galaxea      1.46   3.23
##  9 81    Goniastrea   0      0   
## 10 81    Goniopora    0      0   
## # … with 134 more rows

Tikus Island coral data

What about the means and var for the top 3 Genera per year (sorted from highest to lowest)?

#mean/variance per genus per year, reduced to the 3 genera with the highest
#mean cover in each year
#(pivot_longer() is the current replacement for the superseded gather();
# every column except time and rep is stacked)
tikus %>% rename(`Acropora aspera`=`Acropera aspera`) %>%
    pivot_longer(c(-time, -rep), names_to='Species', values_to='Abundance') %>%
        mutate(Cover=Abundance/10) %>%
            separate(Species,c('Genera','Species')) %>%
                group_by(time,rep,Genera) %>%
                    summarise(SumCover=sum(Cover)) %>%
                        group_by(time,Genera) %>%
                            summarise(Mean=mean(SumCover),
                                      Var=var(SumCover)) %>%
                                #the summary is still grouped by time, so the
                                #top 3 are picked within each year
                                top_n(3,Mean) %>%
                                    #arrange() sorts all rows together by Mean
                                    #(it does not sort within year)
                                    arrange(desc(Mean))
## # A tibble: 18 x 4
## # Groups:   time [6]
##    time  Genera     Mean    Var
##    <fct> <chr>     <dbl>  <dbl>
##  1 87    Montipora 27.4  966.  
##  2 81    Acropora  25.6  383.  
##  3 85    Montipora 20.5  171.  
##  4 85    Porites   19.0   51.3 
##  5 88    Montipora 11.8  644.  
##  6 81    Montipora 11.4   95.7 
##  7 81    Heliopora 11.1  262.  
##  8 84    Montipora 11.0   70.5 
##  9 88    Porites    9.84  41.4 
## 10 88    Acropora   8.55 106.  
## 11 87    Acropora   8.01  68.5 
## 12 87    Porites    4.49  35.8 
## 13 84    Porites    2.94   6.65
## 14 85    Platygyra  2.55   8.74
## 15 83    Porites    1.74   2.07
## 16 84    Pavona     1.2    3.33
## 17 83    Fungia     1.14   3.64
## 18 83    Montipora  0.93   1.57